Read and format input data

##Create a map of the US Coastline and plot landfall points

##            ID                Name  state.name         x        y  Max.Wind Year
## 1    AL011900             UNNAMED       texas -95.17883 29.16898 119.35583 1900
## 1008 AL031903             UNNAMED     florida -85.58116 30.06231  79.84683 1903
## 1775 AL061906             UNNAMED mississippi -88.68922 30.37844  94.96951 1906
## 1864 AL081906             UNNAMED     florida -80.79391 25.15913 101.75561 1906
## 2405 AL021909             UNNAMED       texas -97.21436 26.10459  84.97311 1909
## 2463 AL041909             UNNAMED       texas -95.36640 28.94130  97.46856 1909
##      Month cat
## 1       09   3
## 1008    09   1
## 1775    09   1
## 1864    10   2
## 2405    06   1
## 2463    07   2

Map Hurricane Landfall Points

geography <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  subunitcolor = toRGB("gray85"),
  countrycolor = toRGB("gray85"),
  countrywidth = 0.5,
  subunitwidth = 0.5
)

storm_colors <- c("blue", "green", "yellow", "orange", "red")

storm_plot <- plot_geo(storm_landfall, lat = ~y, lon = ~x) %>%
  add_markers(
   text = ~paste(Year,"<br />",Name,"<br />","Cat:",cat),hoverinfo="text",
    color = ~storm_landfall$cat, size = ~storm_landfall$Max.Wind,
   colors = storm_colors,
    marker = list(colorbar = list(len = 0.2, title = "Landfall Category"))
  ) %>%
  layout(
    title = 'US Hurricane Landfall points', geo = geography
  )
  
storm_plot

Boxplot of the maximum wind speed at landfall for selected states

storm_landfall1<-subset(storm_landfall,state.name %in% c("florida","louisiana","texas","mississippi", "alabama", "south carolina", "north carolina"))

options(warn = -1) 
storm_box <- plot_ly(storm_landfall1, y = ~Max.Wind, color = ~state.name, type = "box") %>%
  layout(title='Max Wind speed at landfall', yaxis = list(title = 'Max Wind speed [mph]'))
storm_box

Plot of storms by Category

This looked better in Tableau than I could have made it in R, so I am reusing this reom my last project.

Plot of Hurricanes by Year and Category

Test of Random Forest vs. Linear Regression to predict number of storms per year

# test linear regression against random forest
lm_model <- lm(storm_num ~ Time+Category+Latitude+Longitude+Maximum.Wind+Year+Month+timestamp , data = train_set)

rf_model = randomForest(storm_num ~ Category+Latitude+Longitude+Maximum.Wind+Year+Month+timestamp, data = train_set, ntree = 5)

test_set$scores = predict(rf_model, newdata = test_set)

summary(test_set)
##       ID                Name               Time              Category     
##  Length:11278       Length:11278       Length:11278       Min.   :0.0000  
##  Class :character   Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.0000  
##                                                           Mean   :0.2757  
##                                                           3rd Qu.:0.0000  
##                                                           Max.   :5.0000  
##     Latitude       Longitude        Maximum.Wind    Minimum.Pressure 
##  Min.   : 7.20   Min.   :-108.90   Min.   :-99.00   Min.   :-999.00  
##  1st Qu.:18.60   1st Qu.: -80.60   1st Qu.: 30.00   1st Qu.:-999.00  
##  Median :26.10   Median : -66.50   Median : 45.00   Median :-999.00  
##  Mean   :26.71   Mean   : -64.73   Mean   : 49.41   Mean   : -78.72  
##  3rd Qu.:32.90   3rd Qu.: -50.73   3rd Qu.: 65.00   3rd Qu.: 997.00  
##  Max.   :75.50   Max.   :   0.00   Max.   :160.00   Max.   :1024.00  
##       Year         Month             timestamp                  
##  Min.   :1900   Length:11278       Min.   :1900-08-28 06:00:00  
##  1st Qu.:1939   Class :character   1st Qu.:1939-08-18 19:30:00  
##  Median :1969   Mode  :character   Median :1969-09-20 09:00:00  
##  Mean   :1966                      Mean   :1966-03-09 10:59:12  
##  3rd Qu.:1995                      3rd Qu.:1995-09-21 04:30:00  
##  Max.   :2015                      Max.   :2015-11-13 12:00:00  
##      YYYYMM         storm_num          scores      
##  Min.   :190008   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.:193908   1st Qu.: 4.000   1st Qu.: 4.000  
##  Median :196909   Median : 7.000   Median : 7.041  
##  Mean   :196559   Mean   : 8.005   Mean   : 8.015  
##  3rd Qu.:199509   3rd Qu.:11.000   3rd Qu.:11.076  
##  Max.   :201511   Max.   :31.000   Max.   :31.000
#get residuals
lm_exp <- DALEX::explain(lm_model, label = "lm", data = train_set, y = train_set$storm_num)
## Preparation of a new explainer is initiated
##   -> model label       :  lm 
##   -> data              :  28549  rows  13  cols 
##   -> target variable   :  28549  values 
##   -> predict function  :  yhat.lm  will be used (  default  )
##   -> predicted values  :  numerical, min =  -3.417257 , mean =  8.023048 , max =  25  
##   -> model_info        :  package stats , ver. 3.6.3 , task regression (  default  ) 
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -11.96225 , mean =  -5.655027e-09 , max =  19.263  
##   A new explainer has been created! 
rf_exp <- DALEX::explain(rf_model, label = "rf", data = train_set, y = train_set$storm_num)
## Preparation of a new explainer is initiated
##   -> model label       :  rf 
##   -> data              :  28549  rows  13  cols 
##   -> target variable   :  28549  values 
##   -> predict function  :  yhat.randomForest  will be used (  default  )
##   -> predicted values  :  numerical, min =  1 , mean =  8.024476 , max =  31  
##   -> model_info        :  package randomForest , ver. 4.6.14 , task regression (  default  ) 
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -6.420833 , mean =  -0.001428336 , max =  9.68  
##   A new explainer has been created! 
lm_mr <- model_residual(lm_exp)
rf_mr <- model_residual(rf_exp)

# Compare accuracy of Random Forest vs. Linear Regression. Random Forest is more accurate
plot(rf_mr, lm_mr, type = "prediction", abline = TRUE)

predictions <- rf_model %>% predict(test_set)
RMSE <-  RMSE(predictions, test_set$storm_num)
R2 <- R2(predictions, test_set$storm_num)

# Print Root Mean Square Error of the prediction
RMSE
## [1] 1.05466
# Print R-Squared
R2
## [1] 0.963336